R Markdown

This project is trying to investigate if investment in different industry segments would impact on the amount of CO2 emission. Varibales being choosed are CO2 emissions total, CO2 emissions per capita, CO2 emissions per GDP, investment in energy, investment in telecommunication, investment in transportation, investment in water.

  1. Get Data
data<-read_xls("climate_change_download_0.xls",sheet = 1,col_types = "guess",na = "..")
summary(data)
##  Country code       Country name       Series code       
##  Length:13512       Length:13512       Length:13512      
##  Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character  
##                                                          
##                                                          
##                                                          
##                                                          
##  Series name            SCALE         Decimals          1990           
##  Length:13512       Min.   :0      Min.   :0.000   Min.   :-3.000e+03  
##  Class :character   1st Qu.:0      1st Qu.:0.000   1st Qu.: 4.000e+00  
##  Mode  :character   Median :0      Median :1.000   Median : 8.100e+01  
##                     Mean   :0      Mean   :0.581   Mean   : 1.698e+10  
##                     3rd Qu.:0      3rd Qu.:1.000   3rd Qu.: 2.704e+03  
##                     Max.   :0      Max.   :1.000   Max.   : 2.190e+13  
##                     NA's   :3495   NA's   :3495    NA's   :8658        
##       1991                 1992                 1993           
##  Min.   :-8.190e+02   Min.   :-8.040e+02   Min.   :-7.590e+02  
##  1st Qu.: 4.000e+00   1st Qu.: 4.000e+00   1st Qu.: 4.000e+00  
##  Median : 1.000e+02   Median : 1.010e+02   Median : 1.000e+02  
##  Mean   : 2.454e+10   Mean   : 2.503e+10   Mean   : 2.493e+10  
##  3rd Qu.: 4.131e+03   3rd Qu.: 4.220e+03   3rd Qu.: 4.243e+03  
##  Max.   : 2.297e+13   Max.   : 2.452e+13   Max.   : 2.488e+13  
##  NA's   :10015        NA's   :9859         NA's   :9795        
##       1994                 1995                 1996           
##  Min.   :-8.500e+02   Min.   :-7.950e+02   Min.   :-6.800e+02  
##  1st Qu.: 4.000e+00   1st Qu.: 7.000e+00   1st Qu.: 4.000e+00  
##  Median : 1.020e+02   Median : 1.000e+02   Median : 1.070e+02  
##  Mean   : 2.635e+10   Mean   : 2.379e+10   Mean   : 3.009e+10  
##  3rd Qu.: 5.409e+03   3rd Qu.: 5.863e+03   3rd Qu.: 8.394e+03  
##  Max.   : 2.672e+13   Max.   : 2.967e+13   Max.   : 3.028e+13  
##  NA's   :9733         NA's   :8840         NA's   :9708        
##       1997                 1998                 1999           
##  Min.   :-7.280e+02   Min.   :-6.020e+02   Min.   :-5.040e+02  
##  1st Qu.: 4.000e+00   1st Qu.: 5.000e+00   1st Qu.: 6.000e+00  
##  Median : 1.260e+02   Median : 1.160e+02   Median : 1.020e+02  
##  Mean   : 3.046e+10   Mean   : 2.983e+10   Mean   : 2.915e+10  
##  3rd Qu.: 1.559e+04   3rd Qu.: 1.535e+04   3rd Qu.: 8.830e+03  
##  Max.   : 3.019e+13   Max.   : 3.008e+13   Max.   : 3.121e+13  
##  NA's   :9745         NA's   :9694         NA's   :9507        
##       2000                 2001                 2002           
##  Min.   :-5.400e+02   Min.   :-6.360e+02   Min.   :-8.310e+02  
##  1st Qu.: 5.000e+00   1st Qu.: 5.000e+00   1st Qu.: 5.000e+00  
##  Median : 9.300e+01   Median : 1.030e+02   Median : 1.020e+02  
##  Mean   : 2.196e+10   Mean   : 2.995e+10   Mean   : 3.078e+10  
##  3rd Qu.: 3.602e+03   3rd Qu.: 1.053e+04   3rd Qu.: 7.915e+03  
##  Max.   : 3.221e+13   Max.   : 3.201e+13   Max.   : 3.327e+13  
##  NA's   :8016         NA's   :9494         NA's   :9455        
##       2003                 2004                 2005           
##  Min.   :-9.800e+02   Min.   :-1.034e+03   Min.   :-1.028e+03  
##  1st Qu.: 5.000e+00   1st Qu.: 5.000e+00   1st Qu.: 8.000e+00  
##  Median : 1.020e+02   Median : 1.000e+02   Median : 1.000e+02  
##  Mean   : 3.501e+10   Mean   : 3.812e+10   Mean   : 3.471e+10  
##  3rd Qu.: 8.840e+03   3rd Qu.: 6.497e+03   3rd Qu.: 6.348e+03  
##  Max.   : 3.744e+13   Max.   : 4.219e+13   Max.   : 4.562e+13  
##  NA's   :9469         NA's   :9287         NA's   :8428        
##       2006                 2007                 2008           
##  Min.   :-1.015e+03   Min.   :-1.013e+03   Min.   :-1.007e+03  
##  1st Qu.: 5.000e+00   1st Qu.: 5.000e+00   1st Qu.: 6.000e+00  
##  Median : 1.000e+02   Median : 1.010e+02   Median : 9.900e+01  
##  Mean   : 4.585e+10   Mean   : 5.255e+10   Mean   : 5.415e+10  
##  3rd Qu.: 7.716e+03   3rd Qu.: 9.454e+03   3rd Qu.: 5.548e+03  
##  Max.   : 4.945e+13   Max.   : 5.580e+13   Max.   : 6.126e+13  
##  NA's   :9276         NA's   :9264         NA's   :8909        
##       2009                 2010              2011        
##  Min.   :-9.900e+02   Min.   :-3.200e+01   Mode:logical  
##  1st Qu.: 3.000e+00   1st Qu.: 3.000e+00   TRUE:466      
##  Median : 9.200e+01   Median : 1.030e+02   NA's:13046    
##  Mean   : 6.287e+10   Mean   : 1.104e+11                 
##  3rd Qu.: 6.840e+03   3rd Qu.: 4.902e+06                 
##  Max.   : 5.808e+13   Max.   : 6.304e+13                 
##  NA's   :9751         NA's   :11180
  1. Data Cleaning
mydata<-dplyr::filter(data,`Series code` =="IE.PPI.ENGY.CD" |`Series code` == "IE.PPI.TELE.CD"|`Series code` =="IE.PPI.TRAN.CD" |`Series code` =="IE.PPI.WATR.CD"|`Series code` =="EN.ATM.CO2E.PC"|`Series code` =="EN.ATM.CO2E.PP.GD.KD"|`Series code` =="EN.ATM.CO2E.KT")
names(mydata)[1]<-"countrycode"
names(mydata)[2]<-"countryname"
names(mydata)[3]<-"seriescode"
names(mydata)[4]<-"seriesname"
mydata<-mydata %>% as.data.frame() %>% melt(measure.var=7:28)
names(mydata)[7]<-"year"
summary(mydata)
##  countrycode        countryname         seriescode       
##  Length:35882       Length:35882       Length:35882      
##  Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character  
##                                                          
##                                                          
##                                                          
##                                                          
##   seriesname            SCALE      Decimals           year      
##  Length:35882       Min.   :0   Min.   :0.0000   1990   : 1631  
##  Class :character   1st Qu.:0   1st Qu.:0.0000   1991   : 1631  
##  Mode  :character   Median :0   Median :0.0000   1992   : 1631  
##                     Mean   :0   Mean   :0.4286   1993   : 1631  
##                     3rd Qu.:0   3rd Qu.:1.0000   1994   : 1631  
##                     Max.   :0   Max.   :1.0000   1995   : 1631  
##                                                  (Other):26096  
##      value           
##  Min.   :-3.000e+03  
##  1st Qu.: 8.000e+00  
##  Median : 4.630e+02  
##  Mean   : 4.726e+08  
##  3rd Qu.: 2.167e+05  
##  Max.   : 7.715e+10  
##  NA's   :20700
names(mydata)
## [1] "countrycode" "countryname" "seriescode"  "seriesname"  "SCALE"      
## [6] "Decimals"    "year"        "value"
spreadtest<-tidyr::unite(mydata,"series",c("seriesname","seriescode"),sep="",remove=TRUE,na.rm=FALSE)
spreadtest$SCALE=NULL
spreadtest$Decimals=NULL
spreadtest<-tidyr::spread(spreadtest,series,value)
names(spreadtest)[4]<-"CO2percapita"
names(spreadtest)[5]<-"CO2perGDP"
names(spreadtest)[6]<-"CO2total"
names(spreadtest)[7]<-"investenergy"
names(spreadtest)[8]<-"investtelecom"
names(spreadtest)[9]<-"investtransport"
names(spreadtest)[10]<-"investwater"
selected<-filter(spreadtest,countrycode=="ARG"|countrycode=="BRA"|countrycode=="ARG"|countrycode=="CHL"|countrycode=="CHN"|countrycode=="COL"|countrycode=="IDN"|countrycode=="IND"|
                   countrycode=="MEX"|countrycode=="MYS"|countrycode=="PER"|countrycode=="PHL"|countrycode=="RUS"|countrycode=="THA")
  1. Data Visualization 3.1. CO2 emission for all selected countries
table1<-xtabs(~CO2total+countryname,data=spreadtest)
table1<-aggregate(CO2total~countryname,rm.na=TRUE,data=spreadtest,FUN = sum)
table2<-xtabs(~CO2percapita+countryname,data=spreadtest)
table2<-aggregate(CO2percapita~countryname,rm.na=TRUE,data=spreadtest,FUN=sum)
table3<-xtabs(~CO2perGDP+countryname,data=spreadtest)
table3<-aggregate(CO2perGDP~countryname,rm.na=TRUE,data=spreadtest,FUN = sum)
#total CO2 emission between year 1990 and year 2011 for all countries 
ggplot(spreadtest,aes(x=countryname,y=CO2total,color=year))+geom_point()+geom_smooth()
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

#CO2 percapita between year 1990 and year 2011 for all countries
ggplot(spreadtest,aes(x=countryname,y=CO2percapita,color=year))+geom_point()+geom_smooth()
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

#CO2 perGDP between year 1990 and year 2011 for all countries
ggplot(selected,aes(x=countryname,y=CO2perGDP,color=year))+geom_point()+geom_smooth()
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

#the overall trend of total CO2 emission for all coutries considered as a population
ggplot(spreadtest,aes(x=year,y=CO2total))+geom_point()+geom_smooth()
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

#the overall trend of CO2 emission per capita for all coutries considered as a population
ggplot(spreadtest,aes(x=year,y=CO2percapita))+geom_point()+geom_smooth()
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

#the overall trend of CO2 emission per GDP for selected countries that have continious investments on different industries
ggplot(selected,aes(x=as.numeric(as.character(year)),y=CO2perGDP,color=countryname))+geom_point()+geom_smooth()
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

#the CO2 emission per GDP between year 1990 and year 2011 for selected countries
plot_ly(selected,x=~as.numeric(as.character(year)),y=~CO2perGDP,type='scatter',mode='lines')
#the CO2 total emission between year 1990 and year 2011 for selected countries
plot_ly(selected,x=~as.numeric(as.character(year)),y=~CO2total,type='scatter',mode='lines')
#the CO2 emission per capita between year 1990 and year 2011 for selected countries
plot_ly(selected,x=~as.numeric(as.character(year)),y=~CO2percapita,type='scatter',mode='lines')

3.2. Top emission from country (CO2 emission per GDP)

clean<-na.omit(spreadtest)
topemission<-arrange(table3,CO2perGDP)
ggplot(head(topemission,n=10),aes(countryname,CO2perGDP,fill=countryname,label=paste(CO2perGDP)))+geom_bar(sta='identity',color='black')+
  coord_flip()+theme_bw()+theme(legend.position = 'none')+geom_text(aes(y=1),hjust=0,vjust=0.5,size=4)+
  labs(title='Top countries with highest CO2 emission per GDP')

3.3 Comparing Investment in Different Segments with CO2 Emission for Each Selected Country

ARG<-filter(selected,countrycode=="ARG")
plot_ly(ARG,x=~ARG$year,y=~ARG$CO2percapita,name="CO2percapita",type='scatter',mode='lines') %>% 
  add_trace(y=~scale(ARG$investenergy),name='energy',mode="lines+markers") %>% 
  add_trace(y=~scale(ARG$investtelecom),name='telecom',mode='lines+markers') %>% 
  add_trace(y=~scale(ARG$investtransport),name='transport',mode='lines+markers')

3.4 Use the 13 Countries as a Sample to Test if Investment in Different Segments would Impact the Amount of CO2 Emission

sample<-na.omit(selected[3:10])
trace_0<-aggregate(sample$CO2percapita,by=list(year=sample$year),FUN=sum)
trace_1<-aggregate(sample$investenergy,by=list(year=sample$year),FUN=sum)
trace_2<-aggregate(sample$investtelecom,by=list(year=sample$year),FUN=sum)
trace_3<-aggregate(sample$investtransport,by=list(year=sample$year),FUN=sum)
plot_ly(sample,x=~trace_0$year,y=~trace_0$x,name="CO2percapita",type='scatter',mode='lines') %>% 
  add_trace(y=~scale(trace_1$x),name='energy',mode='lines+markers') %>% 
  add_trace(y=~scale(trace_2$x),name='telecom',mode='lines+markers') %>% 
  add_trace(y=~scale(trace_3$x),name='transport',mode='lines+markers')
cor.test(sample$CO2percapita,sample$investenergy)
## 
##  Pearson's product-moment correlation
## 
## data:  x and y
## t = 0.52495, df = 109, p-value = 0.6007
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.1374618  0.2344165
## sample estimates:
##        cor 
## 0.05021779
cor.test(sample$CO2percapita,sample$investtelecom)
## 
##  Pearson's product-moment correlation
## 
## data:  x and y
## t = -0.39648, df = 109, p-value = 0.6925
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.2227658  0.1495018
## sample estimates:
##         cor 
## -0.03794859
cor.test(sample$CO2percapita,sample$investtransport)
## 
##  Pearson's product-moment correlation
## 
## data:  x and y
## t = -0.85622, df = 109, p-value = 0.3938
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.2641061  0.1062751
## sample estimates:
##         cor 
## -0.08173693

Based on both the EDA and the correlation test, investment in difference segments does not have significant impact on CO2 emission. Limitations would be: 1. The 13 countries, as a sample, could be not representative regarding the whole world. 2. Simple linear coefficient model shoule be validated using residual check.